[code]
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align='center')
# Start timing the document render
start_time <- Sys.time()This document is part of the Purvis Dev 2025 CRISPR editing outcomes analysis. It uses the CrispRVariants package to analyze CRISPR editing outcomes from paired-end sequencing data.
The following external tools are used in this analysis and must be installed and available in your PATH: bwa mem is used for alignment, and CrispRVariants is used to analyze the editing outcomes. samtools is used for BAM file processing.
knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.align='center')
# Start timing the document render
start_time <- Sys.time()library(CrispRVariants)
library(Rsamtools)
library(GenomicRanges)
library(GenomicFeatures)
library(Biostrings)
library(DT)
library(gridExtra)
source("scripts/qmd_embed_download.R")
source("scripts/renv_embed_download.R")
set.seed(87645893)
dir.create(params$outs_path, showWarnings = TRUE)# Load the sample metadata
sample_metadata <- read.csv(file.path(params$sample_metadata_file), header = TRUE)
datatable(sample_metadata)bwa_ref <- params$ref_genome
raw_data_dir <- params$data_path
bam_dir <- params$processed_data_path
# Get all fastq files in the raw data directory
fq_fnames <- list.files(raw_data_dir, pattern = "fastq.gz$", full.names = TRUE)
# For paired-end reads, group files by sample name (handling _R1_001/_R2_001 naming)
# Extract sample names by removing _R1_001/_R2_001 suffixes
sample_names <- unique(gsub("_R[12]_001", "", basename(fq_fnames)))
sample_names <- gsub("\\.fastq\\.gz$", "", sample_names)
bam_dir_full <- file.path(bam_dir)
dir.create(bam_dir_full, showWarnings = FALSE, recursive = TRUE)
# Process each sample
for(sample in sample_names) {
# Find R1 and R2 files for this sample
r1_file <- fq_fnames[grepl(paste0(sample, "_R1_001"), basename(fq_fnames))]
r2_file <- fq_fnames[grepl(paste0(sample, "_R2_001"), basename(fq_fnames))]
# Check if we have both R1 and R2
if(length(r1_file) == 1 && length(r2_file) == 1) {
# Define file names explicitly
unsorted_bam <- file.path(bam_dir_full, paste0(sample, "_unsorted.bam"))
sorted_bam <- file.path(bam_dir_full, paste0(sample, "_sorted.bam"))
# Skip if already processed
if(!file.exists(paste0(sorted_bam, ".bai"))) {
cmd <- paste0("bwa mem -t ", params$bwa_threads, " ", bwa_ref, " ", r1_file, " ", r2_file,
" | samtools view -Sb - > ", unsorted_bam)
message("Processing paired-end sample: ", sample)
message(cmd, "\n")
system(cmd)
# Sort and index - FIX: Remove .bam extension for sortBam()
message("Sorting and indexing: ", basename(sorted_bam))
sorted_bam_base <- tools::file_path_sans_ext(sorted_bam) # Remove .bam extension
sortBam(unsorted_bam, sorted_bam_base) # sortBam will add .bam automatically
indexBam(sorted_bam) # Index the final .bam file
unlink(unsorted_bam) # Remove unsorted BAM
message("Created: ", sorted_bam)
} else {
message("Already processed: ", sample)
}
} else {
warning("Could not find proper R1/R2 files for sample: ", sample)
message("R1 file found: ", length(r1_file), " - ", ifelse(length(r1_file) > 0, r1_file, "none"))
message("R2 file found: ", length(r2_file), " - ", ifelse(length(r2_file) > 0, r2_file, "none"))
}
}
# Get list of sorted BAM files for downstream analysis
sorted_bam_files <- list.files(bam_dir_full, pattern = "_sorted\\.bam$", full.names = TRUE)
message("Found ", length(sorted_bam_files), " sorted BAM files:")
for(bam_file in sorted_bam_files) {
message(" - ", bam_file)
message(" Index exists: ", file.exists(paste0(bam_file, ".bai")))
}# Define the target location and reference sequence as granges
target_location <- GRanges(
seqnames = params$target_chr,
ranges = IRanges(start = params$target_start, end = params$target_end),
strand = params$target_strand
)
# Extract reference sequence
reference_raw <- system(sprintf("samtools faidx %s %s:%s-%s",
params$ref_genome,
seqnames(target_location)[1],
start(target_location)[1],
end(target_location)[1]),
intern = TRUE)[[2]]
# Convert to DNAString and reverse complement
reference <- Biostrings::reverseComplement(Biostrings::DNAString(reference_raw))
# Check lengths match
target_width <- width(target_location)
ref_length <- nchar(reference_raw)
message("Target width: ", target_width)
message("Reference length: ", ref_length)
message("Reference sequence: ", reference_raw)
# If they don't match, adjust the target to match the reference
if(target_width != ref_length) {
message("Adjusting target location to match reference sequence length")
target_location <- GRanges(
seqnames = params$target_chr,
ranges = IRanges(start = params$target_start, end = params$target_start + ref_length - 1),
strand = params$target_strand
)
message("New target width: ", width(target_location))
}This object is created from the sorted BAM files and the target location. It will contain the read counts and variant information for each sample at the specified target region.
# Grab sorted bams from processed_data using the exact same pattern
sorted_bam_files <- list.files(params$processed_data_path, pattern = "_sorted\\.bam$", full.names = TRUE)
if(length(sorted_bam_files) == 0) {
stop("No sorted BAM files found in ", params$processed_data_path, ". Check alignment step.")
}
message("BAM files found for CrisprSet:")
for(bam_file in sorted_bam_files) {
message(" - ", basename(bam_file))
}
bam_sample_names <- gsub("_sorted\\.bam$", "", basename(sorted_bam_files))
message("Sample names: ", paste(bam_sample_names, collapse = ", "))
# Check all files exist and are indexed
for(bam_file in sorted_bam_files) {
if(!file.exists(bam_file)) {
stop("BAM file not found: ", bam_file)
}
if(!file.exists(paste0(bam_file, ".bai"))) {
message("Indexing missing for: ", basename(bam_file))
indexBam(bam_file)
}
}
crispr_set <- readsToTarget(sorted_bam_files,
target = target_location,
reference = reference,
names = bam_sample_names,
target.loc = params$target_loc_forward)
DT::datatable(variantCounts(crispr_set))DT::datatable(as.data.frame(consensusSeqs(crispr_set)))This section creates variant plots for the CRISPR editing outcomes. It uses the CrispRVariants package to visualize the variants detected in the target region across samples.
gtf_fname <- params$gtf_file
# Check if GTF file exists
if(file.exists(gtf_fname)) {
message("Creating TxDb from GTF file...")
txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_fname, format = "gtf")
# Check if there are any genes in our target region
target_genes <- genes(txdb, filter = list(tx_chrom = seqnames(target_location)[1]))
overlapping_genes <- subsetByOverlaps(target_genes, target_location)
if(length(overlapping_genes) == 0) {
message("No genes found overlapping target region. Proceeding without gene annotation.")
txdb <- NULL
} else {
message("Found ", length(overlapping_genes), " genes overlapping target region")
print(overlapping_genes)
}
} else {
message("GTF file not found at: ", gtf_fname)
message("Proceeding without gene annotation")
txdb <- NULL
}group <- sample_metadata$treatment
plotVariants(crispr_set, txdb = txdb, gene.text.size = 8,
row.ht.ratio = c(1,8), col.wdth.ratio = c(4,2),
plotAlignments.args = list(line.weight = 0.5, ins.size = 2,
legend.symbol.size = 4),
plotFreqHeatmap.args = list(plot.text.size = 3, x.size = 8, group = group,
legend.text.size = 8,
legend.key.height = grid::unit(0.5, "lines"))) TableGrob (2 x 1) "arrange": 2 grobs
z cells name grob
1 1 (1-1,1-1) arrange rect[GRID.rect.11]
2 2 (2-2,1-1) arrange gtable[arrange]
pdf("variant_plot.pdf", width = 8, height = 6)
plotVariants(crispr_set)TableGrob (2 x 1) "arrange": 2 grobs
z cells name grob
1 1 (1-1,1-1) arrange rect[GRID.rect.132]
2 2 (2-2,1-1) arrange gtable[arrange]
dev.off()quartz_off_screen
2
eff <- mutationEfficiency(crispr_set, filter.cols = "ControlDHS15primers", exclude.cols = "ControlDHS15primers")
effControlDHS2primers TestDHS15primers Average Median
2.15 3.34 2.75 2.75
Overall StDev ReadCount
3.34 3621.80 154247.00
crispr_set_rev <- readsToTarget(sorted_bam_files, target = target_location, reference = reference,
names = bam_sample_names, target.loc = params$target_loc_reverse,
orientation = "opposite")
plotVariants(crispr_set_rev)TableGrob (2 x 1) "arrange": 2 grobs
z cells name grob
1 1 (1-1,1-1) arrange rect[GRID.rect.253]
2 2 (2-2,1-1) arrange gtable[arrange]
# Calculate frequency of each variant
variant_freq <- variantFrequency(crispr_set, filter.cols = "ControlDHS15primers", exclude.cols = "ControlDHS15primers")
datatable(variant_freq)quartoThe code in quarto document format is included below which can be used to render and rerun this analysis workflow in its entirety. It can be rendered with the quarto publishing system running in the R programming environment in the renv section. You can download the file using the button below.
renvThis project uses renv to manage package dependencies. renv is an R package that helps create and manage project-specific environments, ensuring that the correct versions of packages are used for reproducibility. To restore the environment, you would typically run renv::restore() in your R session, which will install the packages specified in the renv.lock file. You can download the renv.lock file below to restore the environment.
The current status of the environment is as follows:
renv::status()No issues found -- the project is in a consistent state.
sessionInfo()R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Denver
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] gridExtra_2.3 DT_0.33 GenomicFeatures_1.58.0
[4] AnnotationDbi_1.68.0 Biobase_2.66.0 Rsamtools_2.22.0
[7] Biostrings_2.74.1 XVector_0.46.0 GenomicRanges_1.58.0
[10] GenomeInfoDb_1.42.3 IRanges_2.40.1 S4Vectors_0.44.0
[13] BiocGenerics_0.52.0 CrispRVariants_1.34.0 ggplot2_3.5.2
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 dplyr_1.1.4
[3] farver_2.1.2 blob_1.2.4
[5] bitops_1.0-9 fastmap_1.2.0
[7] RCurl_1.98-1.17 GenomicAlignments_1.42.0
[9] XML_3.99-0.18 digest_0.6.36
[11] lifecycle_1.0.4 KEGGREST_1.46.0
[13] RSQLite_2.4.2 magrittr_2.0.3
[15] compiler_4.4.2 sass_0.4.9
[17] rlang_1.1.4 tools_4.4.2
[19] yaml_2.3.9 rtracklayer_1.66.0
[21] knitr_1.48 labeling_0.4.3
[23] S4Arrays_1.6.0 htmlwidgets_1.6.4
[25] bit_4.6.0 curl_6.4.0
[27] DelayedArray_0.32.0 plyr_1.8.9
[29] RColorBrewer_1.1-3 abind_1.4-8
[31] BiocParallel_1.40.2 withr_3.0.0
[33] grid_4.4.2 scales_1.4.0
[35] SummarizedExperiment_1.36.0 cli_3.6.3
[37] rmarkdown_2.27 crayon_1.5.3
[39] generics_0.1.4 httr_1.4.7
[41] reshape2_1.4.4 rjson_0.2.23
[43] DBI_1.2.3 cachem_1.1.0
[45] stringr_1.5.1 zlibbioc_1.52.0
[47] parallel_4.4.2 BiocManager_1.30.26
[49] restfulr_0.0.16 base64enc_0.1-3
[51] matrixStats_1.5.0 vctrs_0.6.5
[53] Matrix_1.7-1 jsonlite_1.8.8
[55] bit64_4.6.0-1 crosstalk_1.2.1
[57] jquerylib_0.1.4 glue_1.7.0
[59] codetools_0.2-20 stringi_1.8.7
[61] gtable_0.3.6 BiocIO_1.16.0
[63] UCSC.utils_1.2.0 tibble_3.3.0
[65] pillar_1.11.0 htmltools_0.5.8.1
[67] GenomeInfoDbData_1.2.13 R6_2.5.1
[69] evaluate_0.24.0 lattice_0.22-6
[71] png_0.1-8 memoise_2.0.1
[73] bslib_0.7.0 renv_1.1.4
[75] Rcpp_1.0.12 SparseArray_1.6.2
[77] xfun_0.45 MatrixGenerics_1.18.1
[79] pkgconfig_2.0.3
Document render completed at: 2025-07-31 10:04:31
Total render time: 1.24 minutes